library(magrittr)
library(tinytex)
library(ALSM)
## Loading required package: leaps
## Loading required package: SuppDists
## Loading required package: car
## Loading required package: carData
library(stringr)
library(stats)
#plotly Regression packages
library(reshape2) # to load tips data
library(tidymodels) # for the fit() function
## Registered S3 method overwritten by 'tune':
## method from
## required_pkgs.model_spec parsnip
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.4 ──
## ✓ broom 0.7.9 ✓ recipes 0.1.17
## ✓ dials 0.0.10 ✓ rsample 0.1.1
## ✓ dplyr 1.0.7 ✓ tibble 3.1.6
## ✓ ggplot2 3.3.5 ✓ tidyr 1.1.4
## ✓ infer 1.0.0 ✓ tune 0.1.6
## ✓ modeldata 0.1.1 ✓ workflows 0.2.4
## ✓ parsnip 0.1.7 ✓ workflowsets 0.1.0
## ✓ purrr 0.3.4 ✓ yardstick 0.0.9
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x purrr::discard() masks scales::discard()
## x tidyr::extract() masks magrittr::extract()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x dplyr::recode() masks car::recode()
## x purrr::set_names() masks magrittr::set_names()
## x purrr::some() masks car::some()
## x recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
data(tips)
CA demographics dataset can be downloaded from: https://www.countyhealthrankings.org/app/california/2021/downloads
#main CA demographics dataset
demo <- read.csv(file="demographics.csv")
#main working df
df.demo <- demo[2:58,c(3,4,7,8,9,10,11,14,17,18,19,20,21,22,23)]
df.demo[,c(2,4,6,8,12)] <- df.demo[,c(2,4,6,8,12)] / 100 #change % columns to decimal
#main covid case data from jhu.edu
covid <- read.csv(file="CA-2020-cov-data.csv") %>% dplyr::rename(Cases = Case.Count.2020)
df.main <- left_join(df.demo, covid, by="County") %>% dplyr::select(Cases,County,-X,everything())
df.main <- df.main[,1:16]
plot(df.main)
Experiment 1: Full MLR
Y <- df.main$Cases -> Confirmed.COVID19.Cases
X1 <- df.main$NUM.FOOD.INSECURE -> Food.Insecurity
X2 <- df.main$PERCENT.ADULT.DIABETES -> Percent.Adult.Diabetes
X3 <- df.main$PERCENT.OBESITY -> Percent.Adult.Obesity
X4 <- df.main$PERCENT.LIMITED.ACCESS -> Access.to.Healthy.Food
X5 <- df.main$MEDIAN.HOUSEHOLD.INCOME -> Median.Household.Income
X6 <- df.main$AVG.DAILY.PM2.5 -> Air.Pollution
df.full <- cbind(Y,X1,X2,X3,X4,X5,X6) %>% as.data.frame
mod.full <- lm(Confirmed.COVID19.Cases~Food.Insecurity+Percent.Adult.Diabetes+Percent.Adult.Obesity+Access.to.Healthy.Food+Median.Household.Income+Air.Pollution, df.full)
summary(mod.full)
##
## Call:
## lm(formula = Confirmed.COVID19.Cases ~ Food.Insecurity + Percent.Adult.Diabetes +
## Percent.Adult.Obesity + Access.to.Healthy.Food + Median.Household.Income +
## Air.Pollution, data = df.full)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5750668 -480655 243852 590083 3111269
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.816e+06 2.544e+06 -0.714 0.479
## Food.Insecurity 5.079e+01 1.468e+00 34.600 <2e-16 ***
## Percent.Adult.Diabetes -5.907e+05 8.855e+06 -0.067 0.947
## Percent.Adult.Obesity 4.058e+06 6.351e+06 0.639 0.526
## Access.to.Healthy.Food 6.625e+06 5.915e+06 1.120 0.268
## Median.Household.Income -2.057e+01 1.611e+01 -1.277 0.207
## Air.Pollution 1.010e+05 8.448e+04 1.196 0.237
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1557000 on 50 degrees of freedom
## Multiple R-squared: 0.9689, Adjusted R-squared: 0.9652
## F-statistic: 259.6 on 6 and 50 DF, p-value: < 2.2e-16
anova(mod.full)
## Analysis of Variance Table
##
## Response: Confirmed.COVID19.Cases
## Df Sum Sq Mean Sq F value Pr(>F)
## Food.Insecurity 1 3.7369e+15 3.7369e+15 1541.3574 < 2.2e-16 ***
## Percent.Adult.Diabetes 1 4.6145e+12 4.6145e+12 1.9033 0.173844
## Percent.Adult.Obesity 1 1.8402e+13 1.8402e+13 7.5902 0.008165 **
## Access.to.Healthy.Food 1 8.1325e+12 8.1325e+12 3.3544 0.072985 .
## Median.Household.Income 1 4.1984e+12 4.1984e+12 1.7317 0.194191
## Air.Pollution 1 3.4665e+12 3.4665e+12 1.4298 0.237433
## Residuals 50 1.2122e+14 2.4244e+12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova
## function (mod, ...)
## {
## UseMethod("Anova", mod)
## }
## <bytecode: 0x7fb1ceb8d080>
## <environment: namespace:car>
From the ANOVA results of the full MLR, it is clear further model refinement is required. Therefore, we plot the MLR model criteria with plotmodel.s function, part of the ALSM package. Additionally, the Best Subsets Algorithm (also from ALSM) is utilized to determine the best model for this experiment.
#Plot Criterias for Model Selection
plotmodel.s(df.full[,2:7],df.full$Y)
#Best Subsets Algorithm for Model Selection
bs.full <- BestSub(df.full[2:7],df.full$Y,num=1) %>% as.data.frame
bs.full
## p 1 2 3 4 5 6 SSEp r2 r2.adj Cp AICp SBCp
## 1 2 1 0 0 0 0 0 1.600337e+14 0.9589331 0.9581865 13.009519 1637.811 1641.897
## 2 3 1 0 0 0 1 0 1.289219e+14 0.9669168 0.9656915 2.176782 1627.489 1633.618
## 3 4 1 0 0 0 1 1 1.251742e+14 0.9678786 0.9660604 2.630948 1627.808 1635.980
## 4 5 1 0 0 1 1 1 1.224698e+14 0.9685725 0.9661550 3.515448 1628.563 1638.778
## 5 6 1 0 1 1 1 1 1.212309e+14 0.9688905 0.9658405 5.004449 1629.983 1642.241
## 6 7 1 1 1 1 1 1 1.212201e+14 0.9688932 0.9651604 7.000000 1631.978 1646.279
## PRESSp
## 1 4.359156e+14
## 2 3.441509e+14
## 3 4.293698e+14
## 4 4.277007e+14
## 5 4.270869e+14
## 6 4.295439e+14
#Which r^2 and r^2 adjusted are greatest?
r2.max <- bs.full$r2 %>% max
nrow(bs.full==r2.max)
## [1] 6
r2adj.max <- bs.full$r2.adj %>% max
nrow(bs.full==r2adj.max)
## [1] 6
Cp.min <- bs.full$Cp %>% min
nrow(bs.full==Cp.min)
## [1] 6
AIC.min <- bs.full$AICp %>% min
nrow(bs.full==AIC.min)
## [1] 6
SBC.min <- bs.full$SBCp %>% min
nrow(bs.full==SBC.min)
## [1] 6
PRESSp.min <- bs.full$PRESSp %>% min
nrow(bs.full==PRESSp.min)
## [1] 6
Experiment 2: Food insecurity as a factor for COVID-19 cases H0: b1 = 0 Ha: b1 ~= 0
Y <- df.main$Cases
X <- df.main$NUM.FOOD.INSECURE
df.fi <- cbind(X,Y) %>% as.data.frame
mod.fi <- lm(Y~X, df.fi)
summary(mod.fi)
##
## Call:
## lm(formula = Y ~ X, data = df.fi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6229243 -314569 401278 644393 3827475
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.690e+05 2.494e+05 -2.683 0.00962 **
## X 5.004e+01 1.396e+00 35.837 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1706000 on 55 degrees of freedom
## Multiple R-squared: 0.9589, Adjusted R-squared: 0.9582
## F-statistic: 1284 on 1 and 55 DF, p-value: < 2.2e-16
anova(mod.fi)
## Analysis of Variance Table
##
## Response: Y
## Df Sum Sq Mean Sq F value Pr(>F)
## X 1 3.7369e+15 3.7369e+15 1284.3 < 2.2e-16 ***
## Residuals 55 1.6003e+14 2.9097e+12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot linear model data
plot(mod.fi)
#plot data with regression line
plot(X,Y,pch = 16, cex = 1.3, col = "blue", main = "Confirmed COVID-19 Cases vs. Food Insecurity", xlab = "Food Insecurity", ylab = "Confirmed COVID-19 Cases")
abline(mod.fi)
#Plotly
fig.fi <- plot_ly(df.fi, x = ~X, y = ~Y,
type = 'scatter',
alpha=0.65,
mode='markers',
name='COVID-19 Cases') %>%
add_lines(x=~X,y=fitted(mod.fi),name='Regression Line') %>%
layout(title='COVID-19 Cases vs. Food Insecurity',
xaxis = list(title='Number of Food Insecure Adults'),
yaxis = list(title='Confirmed COVID-19 Cases'))
fig.fi
Experiment 3: Comparing Food Insecurity vs. Health factors such as obesity and diabetes. H0: b1 = 0, b2 = 0, b3 = 0 Ha: not H0
Y <- df.main$Cases -> Confirmed.COVID19.Cases
X1 <- df.main$NUM.FOOD.INSECURE -> Food.Insecurity
X2 <- df.main$PERCENT.ADULT.DIABETES -> Percent.Adult.Diabetes
X3 <- df.main$PERCENT.OBESITY -> Percent.Adult.Obesity
X4 <- df.main$PERCENT.LIMITED.ACCESS -> Access.to.Healthy.Food
X5 <- df.main$MEDIAN.HOUSEHOLD.INCOME -> Median.Household.Income
X6 -> df.main$AVG.DAILY.PM2.5 -> Air.Pollution
df.fih <- cbind(Y,X1,X2,X3,X4,X5,X6) %>% as.data.frame
mod.fih <- lm(Confirmed.COVID19.Cases~Food.Insecurity+Percent.Adult.Diabetes+Percent.Adult.Obesity, df.fih)
summary(mod.fih)
##
## Call:
## lm(formula = Confirmed.COVID19.Cases ~ Food.Insecurity + Percent.Adult.Diabetes +
## Percent.Adult.Obesity, data = df.fih)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5765097 -386068 231993 793507 3300537
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.895e+06 1.128e+06 -3.452 0.0011 **
## Food.Insecurity 5.112e+01 1.367e+00 37.402 <2e-16 ***
## Percent.Adult.Diabetes -4.952e+06 8.724e+06 -0.568 0.5727
## Percent.Adult.Obesity 1.325e+07 4.965e+06 2.668 0.0101 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1608000 on 53 degrees of freedom
## Multiple R-squared: 0.9648, Adjusted R-squared: 0.9628
## F-statistic: 484.8 on 3 and 53 DF, p-value: < 2.2e-16
anova(mod.fih)
## Analysis of Variance Table
##
## Response: Confirmed.COVID19.Cases
## Df Sum Sq Mean Sq F value Pr(>F)
## Food.Insecurity 1 3.7369e+15 3.7369e+15 1445.4658 <2e-16 ***
## Percent.Adult.Diabetes 1 4.6145e+12 4.6145e+12 1.7849 0.1873
## Percent.Adult.Obesity 1 1.8402e+13 1.8402e+13 7.1180 0.0101 *
## Residuals 53 1.3702e+14 2.5852e+12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mod.fih)
#Plotly
fig.fih <- plot_ly(df.fih, x = ~X1+X2+X3, y = ~Y,
type='scatter',
alpha=0.65,
mode='markers',
name='COVID-19 Cases') %>%
add_lines(x=~X1+X2+X3,y=fitted(mod.fih),name='Regression Line') %>%
layout(title='COVID-19 Cases vs. Food Insecurity+Obesity+Diabetes',
xaxis = list(title='Food Insecurity + Obesity + Diabetes'),
yaxis = list(title='Confirmed COVID-19 Cases'))
fig.fih
Experiment 3: Unemployment as a factor for COVID-19 cases H0: b1 = 0 Ha: b1 ~= 0
Y <- df.main$Cases
X <- df.main$NUM.UNEMPLOYED
df <- cbind(X,Y) %>% as.data.frame
mod.unemp <- lm(Y~X, df)
summary(mod.unemp)
##
## Call:
## lm(formula = Y ~ X, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2273821 -301636 188771 371340 3418308
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.552e+05 1.285e+05 -3.542 0.000818 ***
## X 2.499e+02 3.584e+00 69.730 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 890200 on 55 degrees of freedom
## Multiple R-squared: 0.9888, Adjusted R-squared: 0.9886
## F-statistic: 4862 on 1 and 55 DF, p-value: < 2.2e-16
anova(mod.unemp)
## Analysis of Variance Table
##
## Response: Y
## Df Sum Sq Mean Sq F value Pr(>F)
## X 1 3.8533e+15 3.8533e+15 4862.2 < 2.2e-16 ***
## Residuals 55 4.3588e+13 7.9250e+11
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot linear model data
plot(mod.unemp)
#Plotly
fig.unemp <- plot_ly(df, x = ~X, y = ~Y,
type='scatter',
alpha=0.65,
mode='markers',
name='COVID-19 Cases') %>%
add_lines(x=~X,y=fitted(mod.unemp),name='Regression Line') %>%
layout(title='COVID-19 Cases vs. Unemployment',
xaxis = list(title='Number of Unemployed Adults'),
yaxis = list(title='Confirmed COVID-19 Cases'))
fig.unemp